#include "nonbonded.h"
#ifdef __sw_slave__
#include <qthread_slave.h>
#endif
enum inv_sqrt_mode_t {
  SQRT_FIRST,
  INV_FIRST,
  SOFTWARE
};
#include <simd.h>
#include "esmd_types.h"
#include "cell.h"
#include "swarch.h"
typedef struct cpe_nonbonded_param {
  cellgrid_t *grid;
  nonbonded_param_t *p;
  mdstat_t *stat;
} cpe_nonbonded_param_t;
#ifdef __sw_host__
#include <qthread.h>
struct CoulRf;
struct VdwLJSwitched;
template<int>
struct CoulMSM;
struct CoulShifted;
template <class, class, int, inv_sqrt_mode_t mode = SQRT_FIRST> extern void slave_nonbonded_force_vec_iterator (cpe_nonbonded_param_t *);
template<class, class, int, inv_sqrt_mode_t mode=SQRT_FIRST>
extern void slave_nonbonded_force_vec_iterator_v2(cpe_nonbonded_param_t *);

extern "C" {

extern void slave_nonbonded_force_vec_lj_rf_cpe(void *);
void nonbonded_force_lj_rf_sw(cellgrid_t *grid, nonbonded_param_t *p, mdstat_t *stat) {
  cpe_nonbonded_param_t param;
  param.grid = grid;
  param.p = p;
  param.stat = stat;
  qthread_spawn(slave_nonbonded_force_vec_iterator_v2<VdwLJSwitched, CoulRf, 1, SOFTWARE>, &param);
  // qthread_spawn(slave_nonbonded_force_vec_lj_rf_cpe, &param);
  qthread_flush_host();
  // timer_start(SCATTER_MPE);
  // gather_x_host(grid);
  // timer_stop(SCATTER_MPE);
  qthread_join();
}
extern void slave_nonbonded_force_vec_lj_shifted_cpe(void *);
void nonbonded_force_lj_shifted_sw(cellgrid_t *grid, nonbonded_param_t *p, mdstat_t *stat) {
  cpe_nonbonded_param_t param;
  param.grid = grid;
  param.p = p;
  param.stat = stat;

  qthread_spawn(slave_nonbonded_force_vec_lj_shifted_cpe, &param);
  qthread_flush_host();
  qthread_join();
}

// extern void slave_nonbonded_force_vec_lj_msm4_cpe(void *);
// extern void slave_nonbonded_force_vec_lj_msm6_cpe(void *);
// extern void slave_nonbonded_force_vec_lj_msm8_cpe(void *);
// extern void slave_nonbonded_force_vec_lj_msm10_cpe(void *);
// extern void slave_nonbonded_force_vec_lj_msm12_cpe(void *);

void nonbonded_force_lj_msm_sw(cellgrid_t *grid, nonbonded_param_t *p, mdstat_t *stat) {
  cpe_nonbonded_param_t param;
  param.grid = grid;
  param.p = p;
  param.stat = stat;
  switch (p->order) {
  case 4:
    qthread_spawn(slave_nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<4>, 1>, &param);
    break;
  case 6:
    qthread_spawn(slave_nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<6>, 1>, &param);
    break;
  case 8:
    qthread_spawn(slave_nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<8>, 1>, &param);
    break;
  case 10:
    qthread_spawn(slave_nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<10>, 1>, &param);
    break;
  case 12:
    qthread_spawn(slave_nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<12>, 1>, &param);
    break;
  }
  qthread_flush_host();
  qthread_join();
}
}
#endif
#ifdef __sw_slave__
// #ifdef OPEN64
// #define DMA_FAST
// #define _SW_COMPILER_VERSION
// #include <dma.h>
// #endif
#include "simd_cpp.hpp"
extern "C" {

#include "dma_macros_new.h"
#include "cal.h"
#include "reg_reduce.h"
extern void simd_print_int512(vint);
#define LWPF_UNIT U(NONB)
#define LWPF_KERNELS \
  K(INIT)            \
  K(COMPUTE)         \
  K(FINI)            \
  K(DMA_RI)          \
  K(DMA_RJ)          \
  K(DMA_WJ)          \
  K(ILOOP)           \
  K(JLOOP)           \
  K(SECT_GLD)        \
  K(INIT_MASK)
#define EVT_PC0 PC0_CYC_LD_ST_BLOCK
#define EVT_PC1 PC1_CYC_DATA_REL_BLOCK
#define EVT_PC2 PC2_CNT_ICACHE_MISS
#define EVT_PC3 PC3_CYCLE
#include "lwpf3/lwpf.h"
// #include "lwpf3/lwpf_disable.h"

// #define max(a, b) ((a) > (b) ? (a) : (b))
// #define min(a, b) ((a) < (b) ? (a) : (b))
}
INLINE real cell2box(vec<real> *o, vec<real> *h, vec<real> *x, int natom) {
  vec<real> xmin, xmax;
  vecset1(xmin, 0);
  vecset1(xmax, 0);
  for (int i = 0; i < natom; i++) {
    vecminv(xmin, xmin, x[i]);
    vecmaxv(xmax, xmax, x[i]);
  }
  vecscaleaddv(*o, xmin, xmax, 0.5, 0.5);
  vecsubv(*h, xmax, *o);
}
__always_inline vreal vvdist2_atom_box(vreal xv4, vreal yv4, vreal zv4, vreal oxv4, vreal oyv4, vreal ozv4, vreal hxv4, vreal hyv4, vreal hzv4) {
  vreal f0v4 = v_set1d(0.0);
  vreal vxv4 = v_cpys(f0v4, xv4 - oxv4);
  vreal vyv4 = v_cpys(f0v4, yv4 - oyv4);
  vreal vzv4 = v_cpys(f0v4, zv4 - ozv4);

  vreal upxv4 = vxv4 - hxv4;
  vreal upyv4 = vyv4 - hyv4;
  vreal upzv4 = vzv4 - hzv4;
  vreal uxv4 = v_sellt(upxv4, f0v4, upxv4);
  vreal uyv4 = v_sellt(upyv4, f0v4, upyv4);
  vreal uzv4 = v_sellt(upzv4, f0v4, upzv4);
  return uxv4 * uxv4 + uyv4 * uyv4 + uzv4 * uzv4;
}

struct VdwLJSwitched {
  static constexpr bool need_sqrt = true, has_scale14 = true;
  const vreal f0v4, f1v4, f2v4, f3v4, f12v4;
  vreal rsw2v4, rcut2v4, inv_denom_switchv4;
  __always_inline VdwLJSwitched(const nonbonded_param_t &pm) : f0v4(v_set1d(0)),
                                                               f1v4(v_set1d(1)),
                                                               f2v4(v_set1d(2)),
                                                               f3v4(v_set1d(3)),
                                                               f12v4(v_set1d(12)) {
    this->rsw2v4 = v_set1d(pm.rsw * pm.rsw);
    this->rcut2v4 = v_set1d(pm.rcut * pm.rcut);
    vreal denom_switchv4 = this->rcut2v4 - this->rsw2v4;
    this->inv_denom_switchv4 = this->f1v4 / (denom_switchv4 * denom_switchv4 * denom_switchv4);
  }
  template <int need_ene>
  __always_inline void compute(vreal &ev4, vreal &fv4,
                               vreal rv4, vreal r2v4, vreal inv_rv4, vreal inv_r2v4,
                               vreal epsv4, vreal sigv4) const {
    vreal sig_r2v4 = sigv4 * sigv4 * inv_r2v4;
    vreal sig_r6v4 = sig_r2v4 * sig_r2v4 * sig_r2v4;
    vreal sig_r12v4 = sig_r6v4 * sig_r6v4;

    fv4 = this->f12v4 * (sig_r6v4 - sig_r12v4) * epsv4 * inv_r2v4;

    vreal gtswv4 = v_fcmpgt(r2v4, this->rsw2v4);
    int has_gtsw = v_any(gtswv4);
    vreal sv4, dsdrv4;
    if (need_ene || has_gtsw) {
      ev4 = (sig_r12v4 - this->f2v4 * sig_r6v4) * epsv4;
    }
    if (has_gtsw) {
      sv4 = (this->rcut2v4 - r2v4) * (this->rcut2v4 - r2v4) * (this->rcut2v4 + this->f2v4 * r2v4 - this->f3v4 * this->rsw2v4) * this->inv_denom_switchv4;
      dsdrv4 = this->f12v4 * (this->rcut2v4 - r2v4) * (this->rsw2v4 - r2v4) * this->inv_denom_switchv4;

      sv4 = v_selgt(gtswv4, sv4, this->f1v4);
      dsdrv4 = v_selgt(gtswv4, dsdrv4, this->f0v4);
      fv4 = ev4 * dsdrv4 + fv4 * sv4;
      if (need_ene)
        ev4 = ev4 * sv4;
    }
    fv4 = -fv4;
  }
};
struct CoulShifted {
  static constexpr bool need_sqrt = true, has_long = false;
  vreal inv_rcut2v4, f1v4, f4v4;
  __always_inline CoulShifted(const nonbonded_param_t &pm) {
    inv_rcut2v4 = v_set1d(1.0 / (pm.rcut * pm.rcut));
    f1v4 = v_set1d(1.0);
    f4v4 = v_set1d(4.0);
  }
  template <int need_ene>
  __always_inline void compute(vreal &ev4, vreal &fv4,
                               vreal rv4, vreal r2v4, vreal inv_rv4, vreal inv_r2v4,
                               vreal kqqv4, vreal coul_factor) const {
    ev4 = kqqv4 * inv_rv4 * coul_factor;
    fv4 = ev4 * inv_r2v4;
    vreal sv4, dsdrv4;
    vreal sqrtsv4 = (this->f1v4 - r2v4 * this->inv_rcut2v4);
    sv4 = sqrtsv4 * sqrtsv4;
    dsdrv4 = this->f4v4 * this->inv_rcut2v4 * sqrtsv4;

    fv4 = ev4 * dsdrv4 + fv4 * sv4;
    ev4 = ev4 * sv4;
  }
};
struct CoulRf {
  static constexpr bool need_sqrt = true, has_long = false;
  vreal k_rfv4, c_rfv4;
  const vreal f2v4;
  __always_inline CoulRf(const nonbonded_param_t &pm) : f2v4(v_set1d(2.0)){
    real eps_rf = 78.5;
    real eps_r = 1;
    real k_rf = (eps_rf - eps_r) / ((2 * eps_rf + eps_r) * pm.rcut * pm.rcut * pm.rcut);
    real c_rf = (3 * eps_rf) / ((2 * eps_rf + eps_r) * pm.rcut);
    this->k_rfv4 = v_set1d(k_rf);
    this->c_rfv4 = v_set1d(c_rf);
  }
  template <int need_ene>
  __always_inline void compute(vreal &ev4, vreal &fv4,
                               vreal rv4, vreal r2v4, vreal inv_rv4, vreal inv_r2v4,
                               vreal kqqv4, vreal coul_factor) const {
    if (need_ene) {
      ev4 = kqqv4 * (inv_rv4 + this->k_rfv4 * r2v4 - this->c_rfv4) * coul_factor;
    }
    fv4 = kqqv4 * (inv_rv4 - this->f2v4 * this->k_rfv4 * r2v4) * inv_r2v4 * coul_factor;
  }
};
template <const int order>
struct CoulMSM {
  static constexpr bool has_long = true, need_sqrt = true;
  vreal gcons[order / 2 + 1];
  vreal dgcons[order / 2];
  vreal inv_rcutv4, inv_rcut2v4;
  vreal f1v4;
  __always_inline CoulMSM(const nonbonded_param_t &pm) {
    real rcut = pm.rcut;
    this->inv_rcutv4 = v_set1d(1.0 / rcut);
    this->inv_rcut2v4 = v_set1d(1.0 / (rcut * rcut));
    this->f1v4 = v_set1d(1.0);
    if (order == 4) {
      this->gcons[0] = v_set1d(15.0 / 8.0);
      this->gcons[1] = v_set1d(-5.0 / 4.0);
      this->gcons[2] = v_set1d(3.0 / 8.0);
    }
    if (order == 6) {
      this->gcons[0] = v_set1d(35.0 / 16.0);
      this->gcons[1] = v_set1d(-35.0 / 16.0);
      this->gcons[2] = v_set1d(21.0 / 16.0);
      this->gcons[3] = v_set1d(-5.0 / 16.0);
    }
    if (order == 8) {
      this->gcons[0] = v_set1d(315.0 / 128.0);
      this->gcons[1] = v_set1d(-105.0 / 32.0);
      this->gcons[2] = v_set1d(189.0 / 64.0);
      this->gcons[3] = v_set1d(-45.0 / 32.0);
      this->gcons[4] = v_set1d(35.0 / 128.0);
    }
    if (order == 10) {
      this->gcons[0] = v_set1d(693.0 / 256.0);
      this->gcons[1] = v_set1d(-1155.0 / 256.0);
      this->gcons[2] = v_set1d(693.0 / 128.0);
      this->gcons[3] = v_set1d(-495.0 / 128.0);
      this->gcons[4] = v_set1d(385.0 / 256.0);
      this->gcons[5] = v_set1d(-63.0 / 256.0);
    }
    if (order == 12) {
      this->gcons[0] = v_set1d(3003.0 / 1024.0);
      this->gcons[1] = v_set1d(-3003.0 / 512.0);
      this->gcons[2] = v_set1d(9009.0 / 1024.0);
      this->gcons[3] = v_set1d(-2145.0 / 256.0);
      this->gcons[4] = v_set1d(5005.0 / 1024.0);
      this->gcons[5] = v_set1d(-819.0 / 512.0);
      this->gcons[6] = v_set1d(231.0 / 1024.0);
    }

    if (order == 4) {
      this->dgcons[0] = v_set1d(-5.0 / 2.0);
      this->dgcons[1] = v_set1d(3.0 / 2.0);
    }
    if (order == 6) {
      this->dgcons[0] = v_set1d(-35.0 / 8.0);
      this->dgcons[1] = v_set1d(21.0 / 4.0);
      this->dgcons[2] = v_set1d(-15.0 / 8.0);
    }
    if (order == 8) {
      this->dgcons[0] = v_set1d(-105.0 / 16.0);
      this->dgcons[1] = v_set1d(189.0 / 16.0);
      this->dgcons[2] = v_set1d(-135.0 / 16.0);
      this->dgcons[3] = v_set1d(35.0 / 16.0);
    }
    if (order == 10) {
      this->dgcons[0] = v_set1d(-1155.0 / 128.0);
      this->dgcons[1] = v_set1d(693.0 / 32.0);
      this->dgcons[2] = v_set1d(-1485.0 / 64.0);
      this->dgcons[3] = v_set1d(385.0 / 32.0);
      this->dgcons[4] = v_set1d(-315.0 / 128.0);
    }
    if (order == 12) {
      this->dgcons[0] = v_set1d(-3003.0 / 256.0);
      this->dgcons[1] = v_set1d(9009.0 / 256.0);
      this->dgcons[2] = v_set1d(-6435.0 / 128.0);
      this->dgcons[3] = v_set1d(5005.0 / 128.0);
      this->dgcons[4] = v_set1d(-4095.0 / 256.0);
      this->dgcons[5] = v_set1d(693.0 / 256.0);
    }
  }
  template <int need_ene>
  __always_inline void compute(vreal &ev4, vreal &fv4,
                               vreal rv4, vreal r2v4, vreal inv_rv4, vreal inv_r2v4,
                               vreal kqqv4, vreal coul_factor) const {
    vreal rhov4 = rv4 * this->inv_rcutv4;
    vreal rho2v4 = r2v4 * this->inv_rcut2v4;
    vreal egammav4 = coul_factor - rhov4 * msm_gammav4(rho2v4);
    vreal fgammav4 = coul_factor + rho2v4 * msm_dgammav4(rhov4, rho2v4);

    vreal u_origv4 = kqqv4 * inv_rv4;
    vreal dudr_origv4 = u_origv4 * inv_r2v4;

    ev4 = u_origv4 * egammav4;
    fv4 = dudr_origv4 * fgammav4;
  }

  __always_inline vreal msm_gammav4(vreal rho2) const {
    vreal g = this->gcons[order / 2];
    for (int i = order / 2 - 1; i >= 0; i--) {
      g = g * rho2 + this->gcons[i];
    }
    return g;
  }

  __always_inline vreal msm_dgammav4(vreal rho, vreal rho2) const {
    vreal dg = this->dgcons[order / 2 - 1];
    for (int i = order / 2 - 2; i >= 0; i--) {
      dg = dg * rho2 + this->dgcons[i];
    }
    return dg * rho;
  }
};

template <int MASKNI>
__attribute__((always_inline)) inline void fill_bitmask(long (*mask)[MASKNI], int (*list)[2], int len) {
  for (int ie = 0; ie < len; ie++) {
    int i = list[ie][0];
    int j = list[ie][1];
    //cal_locked_printf("%d %d\n", i, j);
    mask[j][i >> 6] |= 1L << (i & 63);
  }
}
template <typename T>
__attribute__((always_inline)) inline void zero_fill(T *arr, int len) {
  long *larr = (long *)arr;
  vint v0 = v_set1l(0);
  for (int i = 0; i < len; i += 4) {
    v_stl(larr + i, v0);
  }
}
#include "gcd_lcm.hpp"
template <int MASKNI>
__attribute__((always_inline)) inline void repeat_bitmask(long (*mask)[MASKNI], long *from, int len) {
  constexpr int nseed = lcm<MASKNI, VEC_WIDTH>::value;
  constexpr int nrepeat = nseed / MASKNI;
  constexpr int nvector = nseed;
  long seeds[nseed];
  for (int i = 0; i < nrepeat; i ++) {
    for (int j = 0; j < MASKNI; j ++) {
      seeds[i * MASKNI + j] = from[j];
    }
  }
  long *plain_mask = (long*)mask;
  for (int i = 0; i < len; i += nrepeat) {
    #pragma GCC unroll(VEC_WIDTH)
    for (int j = 0; j < nseed; j += VEC_WIDTH) {
      vint tmp = v_ldl(seeds + j);
      v_stl(plain_mask + i * MASKNI + j, tmp);
    }
  }
}
template <int MASKNI>
__attribute__((always_inline)) inline void copy_bitmask(long (*mask)[MASKNI], long (*from)[MASKNI], int len) {
  
  long *plain_mask = (long*)mask;
  long *plain_from = (long*)from;
  int plain_count = len * MASKNI;
  for (int i = 0; i < plain_count; i += VEC_WIDTH) {
    vint tmp = v_ldl(plain_from + i);
    v_stl(plain_mask + i, tmp);
  }
}
#include "unroll.hpp"
template <class vdwl_cls, class coul_cls, int need_ene = 0, inv_sqrt_mode_t inv_sqrt_mode = SQRT_FIRST>
void nonbonded_force_vec_iterator(cpe_nonbonded_param_t *pm) {
  lwpf_enter(NONB);
  lwpf_start(INIT);

  dma_init();
  cpe_nonbonded_param_t lpm;
  pe_get(pm, &lpm, sizeof(cpe_nonbonded_param_t));
  dma_syn();
  nonbonded_param_t lp;
  cellgrid_t lgrid;
  mdstat_t lstat;
  pe_get(lpm.p, &lp, sizeof(nonbonded_param_t));
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  if (_MYID == 0)
    pe_get(lpm.stat, &lstat, sizeof(mdstat_t));
  dma_syn();

  sw_archdata_t archdata;
  pe_get(lgrid.arch_data, &archdata, sizeof(sw_archdata_t));
  dma_syn();
  vdw_param_t vdw_param[64];
  pe_get(lp.vdw_param, vdw_param, sizeof(vdw_param_t) * lp.ntypes);
  dma_syn();
  vec<real> fi[CELL_CAP];

  lwpf_stop(INIT);

  lwpf_start(COMPUTE);
  int nncell = hdcell(lgrid.nn, 0, 0, 0);
  vdwl_cls vdwl(lp);
  coul_cls coul(lp);

  vreal f0v4 = v_set1d(0.0);
  vreal f1v4 = v_set1d(1.0);
  vreal evdwlv4 = f0v4, ecoulv4 = f0v4;
  real coul_const = lp.coul_const;
  vreal coul_constv4 = v_set1d(lp.coul_const);
  vreal rcut2v4 = v_set1d(lp.rcut * lp.rcut);
  vreal scale14v4 = v_set1d(lp.scale14);

  vreal masks[1 << VEC_WIDTH];
  for (int i = 0; i < (1 << VEC_WIDTH); i++) {
    masks[i] = f0v4;
    real m0 = (i >> 0) & 1;
    real m1 = (i >> 1) & 1;
    real m2 = (i >> 2) & 1;
    real m3 = (i >> 3) & 1;
    masks[i] = f1v4 - v_setd(m0, m1, m2, m3);
  }
  constexpr int maskni = (CELL_CAP + 63) >> 6;

  long iovf_mask_ii[CELL_CAP][maskni];
  for (int j = 0; j < CELL_CAP; j++) {
    int ist = j >> 6;
    iovf_mask_ii[j][ist] = -1L << (j & 63);
    for (int i = ist + 1; i < maskni; i++) {
      iovf_mask_ii[j][i] = -1L;
    }
  }
  //for (auto icell : lgrid.local_cells<SERIAL, PARALLEL_RR, PARALEL_LOADBALANCE, PARALLEL_DYN>()){
  //   auto [xi, ti, qi] = copy_in(icell->x, icell->t, icell->q);
  //   for (auto jcell: icell.half_shell(2)){
  //
  FOREACH_LOCAL_CELL(&lgrid, ii, jj, kk, icell) {
    int cellid = getcid(&lgrid, ii, jj, kk);
    if (!(archdata.pe_range[_MYID].st <= cellid && archdata.pe_range[_MYID].ed > cellid))
      continue;

    long cpe_mask = (1L << _MYID) - 1;
    cellmeta_t imeta;
    lwpf_start(DMA_RI);
    pe_get(&(icell->basis), &imeta, sizeof(cellmeta_t));
    dma_syn();
    vec<real> xi[CELL_CAP];
    real qi[CELL_CAP];
    int ti[CELL_CAP];
    int first_scal_cell[MAX_NEIGH];
    int first_excl_cell[MAX_NEIGH];
    int excl_id[MAX_SCAL_CELL][2];
    int scal_id[MAX_EXCL_CELL][2];
    pe_get(icell->x, xi, sizeof(vec<real>) * imeta.natom);
    dma_syn();
    pe_get(icell->t, ti, sizeof(int) * imeta.natom);
    pe_get(icell->q, qi, sizeof(real) * imeta.natom);
    pe_get(icell->first_excl_cell, first_excl_cell, sizeof(int) * MAX_NEIGH);
    pe_get(icell->first_scal_cell, first_scal_cell, sizeof(int) * MAX_NEIGH);

    vec<real> oi, hi;
    cell2box(&oi, &hi, xi, imeta.natom);

    dma_syn();
    int nexcl = first_excl_cell[nncell + 1];
    int nscal = first_scal_cell[nncell + 1];
    if (nexcl)
      pe_get(icell->excl_id, excl_id, sizeof(int) * 2 * nexcl);
    if (nscal)
      pe_get(icell->scal_id, scal_id, sizeof(int) * 2 * nscal);
    real xit[CELL_CAP], yit[CELL_CAP], zit[CELL_CAP];
    real fxit[CELL_CAP], fyit[CELL_CAP], fzit[CELL_CAP];
    real epsi[CELL_CAP], sigi[CELL_CAP], epsi14[CELL_CAP], sigi14[CELL_CAP];
    for (int i = 0; i < imeta.natom; i++) {
      epsi[i] = vdw_param[ti[i]].eps;
      sigi[i] = vdw_param[ti[i]].sig;
      epsi14[i] = vdw_param[ti[i]].eps14;
      sigi14[i] = vdw_param[ti[i]].sig14;
      xit[i] = xi[i].x;
      yit[i] = xi[i].y;
      zit[i] = xi[i].z;
    }
    zero_fill(fxit, imeta.natom);
    zero_fill(fyit, imeta.natom);
    zero_fill(fzit, imeta.natom);
    for (int i = imeta.natom; i < imeta.natom + 4; i++) {
      xit[i] = 0;
      yit[i] = 0;
      zit[i] = 0;
    }
    long iovf_mask_ij[maskni];
    int ist = imeta.natom >> 6;
    long iovf = -1L << (imeta.natom & 63);
    for (int i = 0; i < ist; i ++) iovf_mask_ij[i] = 0;
    iovf_mask_ij[ist] = -1L << (imeta.natom & 63);
    for (int i = ist + 1; i < maskni; i ++) iovf_mask_ij[i] = -1L;
    dma_syn();
    lwpf_stop(DMA_RI);

    FOREACH_NEIGHBOR_HALF_SHELL(&lgrid, ii, jj, kk, dx, dy, dz, jcell) {
      cellmeta_t jmeta;
      /*this is a marker that indicates the most recent i atom has excl/scal with index j*/
      // float is_mask[CELL_CAP];
      /* excl_mask[j] >> i -> j is excluded with i*/
      long excl_mask[CELL_CAP][maskni];
      long scal_mask[CELL_CAP][maskni];
      long iovf_mask[CELL_CAP][maskni];
      pe_get(&(jcell->basis), &jmeta, sizeof(cellmeta_t));
      int did = hdcell(lgrid.nn, dx, dy, dz);

      lwpf_start(DMA_RJ);

      dma_syn();

      int far = abs(dx) + abs(dy) + abs(dz) > 1;
      int jcell_id = jcell - lgrid.cells;
      vec<real> dcell;
      vecsubv(dcell, jmeta.basis, imeta.basis);

      int ie = first_excl_cell[did];
      int is = first_scal_cell[did];

      vec<real> xj[CELL_CAP];
      vec<real> fj[CELL_CAP];
      int tj[CELL_CAP];
      real qj[CELL_CAP];

      pe_get(jcell->x, xj, sizeof(vec<real>) * jmeta.natom);
      // lwpf_start(INIT_MASK);
      zero_fill(scal_mask, jmeta.natom * maskni);
      if (coul.has_long)
        zero_fill(iovf_mask, jmeta.natom * maskni);

      // lwpf_stop(INIT_MASK);

      // lwpf_start(SECT_GLD);

      if (!coul.has_long) {
        if (icell == jcell) {
          copy_bitmask(excl_mask, iovf_mask_ii, jmeta.natom);
        } else {
          repeat_bitmask(excl_mask, iovf_mask_ij, jmeta.natom);
        }
      } else {
        if (icell == jcell) {
          copy_bitmask(iovf_mask, iovf_mask_ii, jmeta.natom);
        } else {
          repeat_bitmask(iovf_mask, iovf_mask_ij, jmeta.natom);
        }
      }

      fill_bitmask(excl_mask, excl_id + first_excl_cell[did], first_excl_cell[did + 1] - first_excl_cell[did]);
      fill_bitmask(scal_mask, scal_id + first_scal_cell[did], first_scal_cell[did + 1] - first_scal_cell[did]);
      dma_syn();
      pe_get(jcell->t, tj, sizeof(int) * jmeta.natom);
      pe_get(jcell->q, qj, sizeof(real) * jmeta.natom);
      int irep = __builtin_popcountl(jmeta.pe_mask & cpe_mask) - 1;
      if (irep == -1) {
        pe_get(jcell->f, fj, sizeof(vec<real>) * jmeta.natom);
      } else {
        if (imeta.rep_init_mask & (1L << did)) {
          // zero_fill(fj, jmeta.natom * 3);
          for (int j = 0; j < jmeta.natom; j++) {
            vecset1(fj[j], 0);
          }
        } else {
          pe_get(archdata.freps[jmeta.first_frep + irep].f, fj, sizeof(vec<real>) * jmeta.natom);
        }
      }
      vreal dcellxv4 = v_set1d(dcell.x), dcellyv4 = v_set1d(dcell.y), dcellzv4 = v_set1d(dcell.z);
      realint jskip[CELL_CAP], nj = 0;
      if (far) {
        vreal oxv4 = v_set1d(oi.x - dcell.x);
        vreal oyv4 = v_set1d(oi.y - dcell.y);
        vreal ozv4 = v_set1d(oi.z - dcell.z);
        vreal hxv4 = v_set1d(hi.x);
        vreal hyv4 = v_set1d(hi.y);
        vreal hzv4 = v_set1d(hi.z);
        for (int j = 0; j < jmeta.natom; j += VEC_WIDTH) {
          vreal xv4, yv4, zv4;
          simd_load_xyzvec(xv4, yv4, zv4, (real *)(xj + j));
          vreal dist2v4 = vvdist2_atom_box(xv4, yv4, zv4, oxv4, oyv4, ozv4, hxv4, hyv4, hzv4);
          vreal skipv4 = v_fcmplt(rcut2v4, dist2v4);
          v_std((real *)jskip + j, skipv4);
        }
      } else {
        zero_fill(jskip, jmeta.natom);
        // for (int j = 0; j < jmeta.natom; j++) {
        //   jskip[j] = 0;
        // }
      }

      // lwpf_stop(SECT_GLD);
      dma_syn();

      lwpf_stop(DMA_RJ);
      lwpf_start(JLOOP);
      for (int j = 0; j < jmeta.natom; j++) {
        if (jskip[j])
          continue;
        vec<real> xneigh;
        vecaddv(xneigh, dcell, xj[j]);

        int ni = imeta.natom;
        if (icell == jcell) {
          ni = j;
        }

        // vreal xjv4 = v_set1d(xneigh.x);
        // vreal yjv4 = v_set1d(xneigh.y);
        // vreal zjv4 = v_set1d(xneigh.z);
        real kqjv4 = qj[j] * coul_const;
        // real qneigh = qj[j];
        vreal sigjv4 = v_set1d(vdw_param[tj[j]].sig);
        vreal epsjv4 = v_set1d(vdw_param[tj[j]].eps);
        vreal sigj14v4 = v_set1d(vdw_param[tj[j]].sig14);
        vreal epsj14v4 = v_set1d(vdw_param[tj[j]].eps14);
        vreal fxjv4 = f0v4, fyjv4 = f0v4, fzjv4 = f0v4;
        lwpf_start(ILOOP);
        for (int i = 0; i < ni; i += VEC_WIDTH) {
          int emask = excl_mask[j][i >> 6] >> (i & 63) & (1 << VEC_WIDTH) - 1;
          int smask = scal_mask[j][i >> 6] >> (i & 63) & (1 << VEC_WIDTH) - 1;
          vreal exclv4 = masks[emask];
          vreal scalv4 = masks[smask];

          vreal xiv4, yiv4, ziv4;
          xiv4 = v_ldd(xit + i);
          yiv4 = v_ldd(yit + i);
          ziv4 = v_ldd(zit + i);
          vreal dxv4 = xiv4 - xneigh.x;
          vreal dyv4 = yiv4 - xneigh.y;
          vreal dzv4 = ziv4 - xneigh.z;
          vreal r2v4 = dxv4 * dxv4 + dyv4 * dyv4 + dzv4 * dzv4;
          vreal ltcutv4 = v_fcmplt(r2v4, rcut2v4);
          if (!coul.has_long) {
            ltcutv4 = v_seleq(exclv4, f0v4, ltcutv4);
          } else {
            vreal maskv4 = masks[iovf_mask[j][i >> 6] >> (i & 63) & (1 << VEC_WIDTH) - 1];
            ltcutv4 = v_seleq(maskv4, f0v4, ltcutv4);
          }
          int has_ltcut = v_any(ltcutv4);

          if (has_ltcut) {
            vreal inv_r2v4, inv_rv4, rv4;
            if (coul.need_sqrt || vdwl.need_sqrt) {
              if (inv_sqrt_mode == SQRT_FIRST) {
                rv4 = v_sqrt(r2v4);
                inv_r2v4 = f1v4 / r2v4;
                inv_rv4 = rv4 * inv_r2v4;
              }
              if (inv_sqrt_mode == INV_FIRST) {
                inv_r2v4 = f1v4 / r2v4;
                inv_rv4 = v_sqrt(inv_r2v4);
                rv4 = inv_rv4 * r2v4;
              }
              if (inv_sqrt_mode == SOFTWARE) {
                inv_rv4 = v_invsqrt(r2v4);
                inv_r2v4 = inv_rv4 * inv_rv4;
                rv4 = inv_rv4 * r2v4;
              }
            } else {
              inv_r2v4 = f1v4 / r2v4;
            }

            vreal qiv4 = v_ldd(qi + i);
            // vreal qqv4 = qneigh * qiv4;
            // vreal kqqv4 = qqv4 * coul_constv4;
            vreal kqqv4 = kqjv4 * qiv4;
            vreal dwdrv4, dudrv4, wv4, uv4;

            vreal sigi14v4, epsi14v4, sigiv4, epsiv4;
            sigi14v4 = v_ldd(sigi14 + i);
            sigiv4 = v_ldd(sigi + i);
            epsi14v4 = v_ldd(epsi14 + i);
            epsiv4 = v_ldd(epsi + i);

            vreal epsi_usev4 = v_seleq(scalv4, epsi14v4, epsiv4);
            vreal epsj_usev4 = v_seleq(scalv4, epsj14v4, epsjv4);
            vreal sigi_usev4 = v_seleq(scalv4, sigi14v4, sigiv4);
            vreal sigj_usev4 = v_seleq(scalv4, sigj14v4, sigjv4);
            vreal coul_factorv4 = v_seleq(scalv4, scale14v4, f1v4);
            if (coul.has_long) {
              coul_factorv4 = v_seleq(exclv4, f0v4, coul_factorv4);
            }
            vreal epsv4 = epsi_usev4 * epsj_usev4; //simd_vseleq(scalv4, eps14v4, epsxxv4);
            vreal sigv4 = sigi_usev4 + sigj_usev4; //simd_vseleq(scalv4, sig14v4, sigxxv4);

            vdwl.template compute<need_ene>(wv4, dwdrv4, rv4, r2v4, inv_rv4, inv_r2v4, epsv4, sigv4);
            if (_MYID == 13 && v_any(scalv4)){
              double s4[4], w4[4], dwdr4[4];
              v_std(s4, scalv4);
              v_std(w4, wv4);
              v_std(dwdr4, dwdrv4);
              for (int ii = 0; ii < 4; ii ++){
                if (s4[ii] != 0) {
                  printf("%d %d %d %d %f %f\n", i+ii, j, icell->tag[i+ii], jcell->tag[j], w4[ii], dwdr4[ii]);
                }
              }
              //simd_print_doublev4(wv4);
            }
            coul.template compute<need_ene>(uv4, dudrv4, rv4, r2v4, inv_rv4, inv_r2v4, kqqv4, coul_factorv4);

            uv4 = v_selgt(ltcutv4, uv4, f0v4);
            dudrv4 = v_selgt(ltcutv4, dudrv4, f0v4);
            if (need_ene)
            wv4 = v_selgt(ltcutv4, wv4, f0v4);
            if (coul.has_long) {
              if (need_ene)
              wv4 = v_seleq(exclv4, f0v4, wv4);
              dwdrv4 = v_seleq(exclv4, f0v4, dwdrv4);
            }
            vreal fpairv4 = dwdrv4 + dudrv4;

            fpairv4 = v_selgt(ltcutv4, fpairv4, f0v4);

            vreal fxiv4, fyiv4, fziv4;
            fxiv4 = v_ldd(fxit + i);
            fyiv4 = v_ldd(fyit + i);
            fziv4 = v_ldd(fzit + i);

            fxjv4 -= fpairv4 * dxv4;
            fyjv4 -= fpairv4 * dyv4;
            fzjv4 -= fpairv4 * dzv4;

            fxiv4 += fpairv4 * dxv4;
            fyiv4 += fpairv4 * dyv4;
            fziv4 += fpairv4 * dzv4;

            v_std(fxit + i, fxiv4);
            v_std(fyit + i, fyiv4);
            v_std(fzit + i, fziv4);
            if (need_ene) {
              evdwlv4 += wv4;
              ecoulv4 += uv4;
            }
          }
        }

        lwpf_stop(ILOOP);
        vec<real> fjj;
        fjj.x = v_sum(fxjv4);
        fjj.y = v_sum(fyjv4);
        fjj.z = v_sum(fzjv4);

        vecaddv(fj[j], fj[j], fjj);
      }

      lwpf_stop(JLOOP);
      if (icell == jcell) {
        for (int i = 0; i < jmeta.natom; i++) {
          fj[i].x += fxit[i];
          fj[i].y += fyit[i];
          fj[i].z += fzit[i];
        }
      }

      lwpf_start(DMA_WJ);
      if (irep == -1) {
        pe_put(jcell->f, fj, sizeof(vec<real>) * jmeta.natom);
      } else {
        pe_put(archdata.freps[jmeta.first_frep + irep].f, fj, sizeof(vec<real>) * jmeta.natom);
      }
      dma_syn();
      lwpf_stop(DMA_WJ);
    }
  }
  lwpf_stop(COMPUTE);
  qthread_syn();
  //if (_MYID == 13) exit(0);
  FOREACH_CELL_CPE_RR(&lgrid, ii, jj, kk, cell) {
    cellmeta_t meta;
    pe_get(&(cell->basis), &meta, sizeof(cellmeta_t));
    dma_syn();
    vec<real> f[CELL_CAP];
    pe_get(cell->f, f, sizeof(vec<real>) * meta.natom);
    dma_syn();

    int nrep_cell = __builtin_popcountl(meta.pe_mask) - 1;
    for (int irep = 0; irep < nrep_cell; irep++) {
      vec<real> frep[CELL_CAP];
      pe_get(archdata.freps[meta.first_frep + irep].f, frep, sizeof(vec<real>) * meta.natom);
      dma_syn();
      for (int i = 0; i < meta.natom; i++) {
        vecaddv(f[i], f[i], frep[i]);
      }
    }
    pe_put(cell->f, f, sizeof(vec<real>) * meta.natom);
    dma_syn();
  }
  lwpf_start(FINI);

  double energy[4];

  energy[0] = v_sum(evdwlv4);
  energy[1] = v_sum(ecoulv4);

  reg_reduce_inplace_doublev4(energy, 1);

  if (_MYID == 0) {
    lstat.evdwl += energy[0];
    lstat.ecoul += energy[1];
    pe_put(lpm.stat, &lstat, sizeof(mdstat_t));
    dma_syn();
  }
  lwpf_stop(FINI);
  lwpf_exit(NONB);
}


template <class vdwl_cls, class coul_cls, int need_ene = 0, inv_sqrt_mode_t inv_sqrt_mode = SQRT_FIRST>
void nonbonded_force_vec_iterator_v2(cpe_nonbonded_param_t *pm) {
  lwpf_enter(NONB);
  lwpf_start(INIT);

  dma_init();
  cpe_nonbonded_param_t lpm;
  pe_get(pm, &lpm, sizeof(cpe_nonbonded_param_t));
  dma_syn();
  nonbonded_param_t lp;
  cellgrid_t lgrid;
  mdstat_t lstat;
  pe_get(lpm.p, &lp, sizeof(nonbonded_param_t));
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  if (_MYID == 0)
    pe_get(lpm.stat, &lstat, sizeof(mdstat_t));
  dma_syn();

  sw_archdata_t archdata;
  pe_get(lgrid.arch_data, &archdata, sizeof(sw_archdata_t));
  dma_syn();
  vdw_param_t vdw_param[64];
  pe_get(lp.vdw_param, vdw_param, sizeof(vdw_param_t) * lp.ntypes);
  dma_syn();
  vec<real> fi[CELL_CAP];

  lwpf_stop(INIT);

  lwpf_start(COMPUTE);
  int nncell = hdcell(lgrid.nn, 0, 0, 0);
  vdwl_cls vdwl(lp);
  coul_cls coul(lp);

  vreal f0v4 = v_set1d(0.0);
  vreal f1v4 = v_set1d(1.0);
  vreal evdwlv4 = f0v4, ecoulv4 = f0v4;
  real coul_const = lp.coul_const;
  vreal coul_constv4 = v_set1d(lp.coul_const);
  vreal rcut2v4 = v_set1d(lp.rcut * lp.rcut);
  vreal scale14v4 = v_set1d(lp.scale14);

  vreal masks[1 << VEC_WIDTH];
  for (int i = 0; i < (1 << VEC_WIDTH); i++) {
    masks[i] = f0v4;
    real m0 = (i >> 0) & 1;
    real m1 = (i >> 1) & 1;
    real m2 = (i >> 2) & 1;
    real m3 = (i >> 3) & 1;
    masks[i] = f1v4 - v_setd(m0, m1, m2, m3);
  }
  constexpr int maskni = (CELL_CAP + 63) >> 6;

  long iovf_mask_ii[CELL_CAP][maskni];
  for (int j = 0; j < CELL_CAP; j++) {
    int ist = j >> 6;
    iovf_mask_ii[j][ist] = -1L << (j & 63);
    for (int i = ist + 1; i < maskni; i++) {
      iovf_mask_ii[j][i] = -1L;
    }
  }
  FOREACH_LOCAL_CELL(&lgrid, ii, jj, kk, icell) {
    int cellid = getcid(&lgrid, ii, jj, kk);
    if (!(archdata.pe_range[_MYID].st <= cellid && archdata.pe_range[_MYID].ed > cellid))
      continue;

    long cpe_mask = (1L << _MYID) - 1;
    cellmeta_t imeta;
    lwpf_start(DMA_RI);
    pe_get(&(icell->basis), &imeta, sizeof(cellmeta_t));
    dma_syn();
    vec<real> xi[CELL_CAP];
    real qi[CELL_CAP];
    int ti[CELL_CAP];
    int first_scal_cell[MAX_NEIGH];
    int first_excl_cell[MAX_NEIGH];
    int excl_id[MAX_SCAL_CELL][2];
    int scal_id[MAX_EXCL_CELL][2];
    pe_get(icell->x, xi, sizeof(vec<real>) * imeta.natom);
    dma_syn();
    pe_get(icell->t, ti, sizeof(int) * imeta.natom);
    pe_get(icell->q, qi, sizeof(real) * imeta.natom);
    pe_get(icell->first_excl_cell, first_excl_cell, sizeof(int) * MAX_NEIGH);
    pe_get(icell->first_scal_cell, first_scal_cell, sizeof(int) * MAX_NEIGH);

    vec<real> oi, hi;
    cell2box(&oi, &hi, xi, imeta.natom);

    dma_syn();
    int nexcl = first_excl_cell[nncell + 1];
    int nscal = first_scal_cell[nncell + 1];
    if (nexcl)
      pe_get(icell->excl_id, excl_id, sizeof(int) * 2 * nexcl);
    if (nscal)
      pe_get(icell->scal_id, scal_id, sizeof(int) * 2 * nscal);
    real xit[CELL_CAP], yit[CELL_CAP], zit[CELL_CAP];
    real fxit[CELL_CAP], fyit[CELL_CAP], fzit[CELL_CAP];
    real epsi[CELL_CAP], sigi[CELL_CAP], epsi14[CELL_CAP], sigi14[CELL_CAP];
    for (int i = 0; i < imeta.natom; i++) {
      epsi[i] = vdw_param[ti[i]].eps;
      sigi[i] = vdw_param[ti[i]].sig;
      epsi14[i] = vdw_param[ti[i]].eps14;
      sigi14[i] = vdw_param[ti[i]].sig14;
      xit[i] = xi[i].x;
      yit[i] = xi[i].y;
      zit[i] = xi[i].z;
    }
    zero_fill(fxit, imeta.natom);
    zero_fill(fyit, imeta.natom);
    zero_fill(fzit, imeta.natom);
    for (int i = imeta.natom; i < imeta.natom + 4; i++) {
      xit[i] = 0;
      yit[i] = 0;
      zit[i] = 0;
    }
    long iovf_mask_ij[maskni];
    int ist = imeta.natom >> 6;
    long iovf = -1L << (imeta.natom & 63);
    for (int i = 0; i < ist; i ++) iovf_mask_ij[i] = 0;
    iovf_mask_ij[ist] = -1L << (imeta.natom & 63);
    for (int i = ist + 1; i < maskni; i ++) iovf_mask_ij[i] = -1L;
    dma_syn();
    lwpf_stop(DMA_RI);

    FOREACH_NEIGHBOR_HALF_SHELL(&lgrid, ii, jj, kk, dx, dy, dz, jcell) {
      cellmeta_t jmeta;
      /*this is a marker that indicates the most recent i atom has excl/scal with index j*/
      // float is_mask[CELL_CAP];
      /* excl_mask[j] >> i -> j is excluded with i*/
      long excl_mask[CELL_CAP][maskni];
      long scal_mask[CELL_CAP][maskni];
      long iovf_mask[CELL_CAP][maskni];
      pe_get(&(jcell->basis), &jmeta, sizeof(cellmeta_t));
      int did = hdcell(lgrid.nn, dx, dy, dz);

      lwpf_start(DMA_RJ);

      dma_syn();

      int far = abs(dx) + abs(dy) + abs(dz) > 1;
      int jcell_id = jcell - lgrid.cells;
      vec<real> dcell;
      vecsubv(dcell, jmeta.basis, imeta.basis);

      int ie = first_excl_cell[did];
      int is = first_scal_cell[did];

      vec<real> xj[CELL_CAP];
      vec<real> fj[CELL_CAP];
      int tj[CELL_CAP];
      real qj[CELL_CAP];

      pe_get(jcell->x, xj, sizeof(vec<real>) * jmeta.natom);
      // lwpf_start(INIT_MASK);
      zero_fill(scal_mask, jmeta.natom * maskni);
      if (coul.has_long)
        zero_fill(iovf_mask, jmeta.natom * maskni);

      // lwpf_stop(INIT_MASK);

      // lwpf_start(SECT_GLD);

      if (!coul.has_long) {
        if (icell == jcell) {
          copy_bitmask(excl_mask, iovf_mask_ii, jmeta.natom);
        } else {
          repeat_bitmask(excl_mask, iovf_mask_ij, jmeta.natom);
        }
      } else {
        if (icell == jcell) {
          copy_bitmask(iovf_mask, iovf_mask_ii, jmeta.natom);
        } else {
          repeat_bitmask(iovf_mask, iovf_mask_ij, jmeta.natom);
        }
      }

      fill_bitmask(excl_mask, excl_id + first_excl_cell[did], first_excl_cell[did + 1] - first_excl_cell[did]);
      fill_bitmask(excl_mask, scal_id + first_scal_cell[did], first_scal_cell[did + 1] - first_scal_cell[did]);
      dma_syn();
      pe_get(jcell->t, tj, sizeof(int) * jmeta.natom);
      pe_get(jcell->q, qj, sizeof(real) * jmeta.natom);
      int irep = __builtin_popcountl(jmeta.pe_mask & cpe_mask) - 1;
      if (irep == -1) {
        pe_get(jcell->f, fj, sizeof(vec<real>) * jmeta.natom);
      } else {
        if (imeta.rep_init_mask & (1L << did)) {
          // zero_fill(fj, jmeta.natom * 3);
          for (int j = 0; j < jmeta.natom; j++) {
            vecset1(fj[j], 0);
          }
        } else {
          pe_get(archdata.freps[jmeta.first_frep + irep].f, fj, sizeof(vec<real>) * jmeta.natom);
        }
      }
      vreal dcellxv4 = v_set1d(dcell.x), dcellyv4 = v_set1d(dcell.y), dcellzv4 = v_set1d(dcell.z);
      realint jskip[CELL_CAP], nj = 0;
      if (far) {
        vreal oxv4 = v_set1d(oi.x - dcell.x);
        vreal oyv4 = v_set1d(oi.y - dcell.y);
        vreal ozv4 = v_set1d(oi.z - dcell.z);
        vreal hxv4 = v_set1d(hi.x);
        vreal hyv4 = v_set1d(hi.y);
        vreal hzv4 = v_set1d(hi.z);
        for (int j = 0; j < jmeta.natom; j += VEC_WIDTH) {
          vreal xv4, yv4, zv4;
          simd_load_xyzvec(xv4, yv4, zv4, (real *)(xj + j));
          vreal dist2v4 = vvdist2_atom_box(xv4, yv4, zv4, oxv4, oyv4, ozv4, hxv4, hyv4, hzv4);
          vreal skipv4 = v_fcmplt(rcut2v4, dist2v4);
          v_std((real *)jskip + j, skipv4);
        }
      } else {
        zero_fill(jskip, jmeta.natom);
        // for (int j = 0; j < jmeta.natom; j++) {
        //   jskip[j] = 0;
        // }
      }

      // lwpf_stop(SECT_GLD);
      dma_syn();

      lwpf_stop(DMA_RJ);
      lwpf_start(JLOOP);
      for (int j = 0; j < jmeta.natom; j++) {
        if (jskip[j])
          continue;
        vec<real> xneigh;
        vecaddv(xneigh, dcell, xj[j]);

        int ni = imeta.natom;
        if (icell == jcell) {
          ni = j;
        }

        // vreal xjv4 = v_set1d(xneigh.x);
        // vreal yjv4 = v_set1d(xneigh.y);
        // vreal zjv4 = v_set1d(xneigh.z);
        real kqjv4 = qj[j] * coul_const;
        // real qneigh = qj[j];
        vreal sigjv4 = v_set1d(vdw_param[tj[j]].sig);
        vreal epsjv4 = v_set1d(vdw_param[tj[j]].eps);
        //vreal sigj14v4 = v_set1d(vdw_param[tj[j]].sig14);
        //vreal epsj14v4 = v_set1d(vdw_param[tj[j]].eps14);
        vreal fxjv4 = f0v4, fyjv4 = f0v4, fzjv4 = f0v4;
        lwpf_start(ILOOP);
        for (int i = 0; i < ni; i += VEC_WIDTH) {
          int emask = excl_mask[j][i >> 6] >> (i & 63) & (1 << VEC_WIDTH) - 1;
          //int smask = scal_mask[j][i >> 6] >> (i & 63) & (1 << VEC_WIDTH) - 1;
          vreal exclv4 = masks[emask];
          //vreal scalv4 = masks[smask];

          vreal xiv4, yiv4, ziv4;
          xiv4 = v_ldd(xit + i);
          yiv4 = v_ldd(yit + i);
          ziv4 = v_ldd(zit + i);
          vreal dxv4 = xiv4 - xneigh.x;
          vreal dyv4 = yiv4 - xneigh.y;
          vreal dzv4 = ziv4 - xneigh.z;
          vreal r2v4 = dxv4 * dxv4 + dyv4 * dyv4 + dzv4 * dzv4;
          vreal ltcutv4 = v_fcmplt(r2v4, rcut2v4);
          if (!coul.has_long) {
            ltcutv4 = v_seleq(exclv4, f0v4, ltcutv4);
          } else {
            vreal maskv4 = masks[iovf_mask[j][i >> 6] >> (i & 63) & (1 << VEC_WIDTH) - 1];
            ltcutv4 = v_seleq(maskv4, f0v4, ltcutv4);
          }
          int has_ltcut = v_any(ltcutv4);

          if (has_ltcut) {
            vreal inv_r2v4, inv_rv4, rv4;
            if (coul.need_sqrt || vdwl.need_sqrt) {
              if (inv_sqrt_mode == SQRT_FIRST) {
                rv4 = v_sqrt(r2v4);
                inv_r2v4 = f1v4 / r2v4;
                inv_rv4 = rv4 * inv_r2v4;
              }
              if (inv_sqrt_mode == INV_FIRST) {
                inv_r2v4 = f1v4 / r2v4;
                inv_rv4 = v_sqrt(inv_r2v4);
                rv4 = inv_rv4 * r2v4;
              }
              if (inv_sqrt_mode == SOFTWARE) {
                inv_rv4 = v_invsqrt(r2v4);
                inv_r2v4 = inv_rv4 * inv_rv4;
                rv4 = inv_rv4 * r2v4;
              }
            } else {
              inv_r2v4 = f1v4 / r2v4;
            }

            vreal qiv4 = v_ldd(qi + i);
            vreal kqqv4 = kqjv4 * qiv4;
            vreal dwdrv4, dudrv4, wv4, uv4;

            vreal sigi14v4, epsi14v4, sigiv4, epsiv4;

            sigiv4 = v_ldd(sigi + i);
            epsiv4 = v_ldd(epsi + i);

            vreal coul_factorv4 = f1v4;
            if (coul.has_long) {
              coul_factorv4 = v_seleq(exclv4, f0v4, coul_factorv4);
            }
            vreal epsv4 = epsiv4 * epsjv4; //simd_vseleq(scalv4, eps14v4, epsxxv4);
            vreal sigv4 = sigiv4 + sigjv4; //simd_vseleq(scalv4, sig14v4, sigxxv4);

            vdwl.template compute<need_ene>(wv4, dwdrv4, rv4, r2v4, inv_rv4, inv_r2v4, epsv4, sigv4);
            coul.template compute<need_ene>(uv4, dudrv4, rv4, r2v4, inv_rv4, inv_r2v4, kqqv4, coul_factorv4);

            uv4 = v_selgt(ltcutv4, uv4, f0v4);
            dudrv4 = v_selgt(ltcutv4, dudrv4, f0v4);
            if (need_ene)
            wv4 = v_selgt(ltcutv4, wv4, f0v4);
            if (coul.has_long) {
              if (need_ene)
              wv4 = v_seleq(exclv4, f0v4, wv4);
              dwdrv4 = v_seleq(exclv4, f0v4, dwdrv4);
            }
            vreal fpairv4 = dwdrv4 + dudrv4;

            fpairv4 = v_selgt(ltcutv4, fpairv4, f0v4);

            vreal fxiv4, fyiv4, fziv4;
            fxiv4 = v_ldd(fxit + i);
            fyiv4 = v_ldd(fyit + i);
            fziv4 = v_ldd(fzit + i);

            fxjv4 -= fpairv4 * dxv4;
            fyjv4 -= fpairv4 * dyv4;
            fzjv4 -= fpairv4 * dzv4;

            fxiv4 += fpairv4 * dxv4;
            fyiv4 += fpairv4 * dyv4;
            fziv4 += fpairv4 * dzv4;

            v_std(fxit + i, fxiv4);
            v_std(fyit + i, fyiv4);
            v_std(fzit + i, fziv4);
            if (need_ene) {
              evdwlv4 += wv4;
              ecoulv4 += uv4;
            }
          }
        }

        lwpf_stop(ILOOP);
        vec<real> fjj;
        fjj.x = v_sum(fxjv4);
        fjj.y = v_sum(fyjv4);
        fjj.z = v_sum(fzjv4);

        vecaddv(fj[j], fj[j], fjj);
      }
      lwpf_stop(JLOOP);
      for (int is = first_scal_cell[did]; is < first_scal_cell[did + 1]; is ++){
        int i = scal_id[is][0];
        int j = scal_id[is][1];
        vec<real> rij = xi[i] - (xj[j] + dcell);
        vreal r2 = v_set1d(rij.norm2());
        
        vreal rinv = f1v4 / v_sqrt(r2);
        vreal r = r2 * rinv;
        vreal r2inv = rinv * rinv;
        vreal sig = v_set1d(vdw_param[ti[i]].sig14 + vdw_param[tj[j]].sig14);
        vreal eps = v_set1d(vdw_param[ti[i]].eps14 * vdw_param[tj[j]].eps14);
        vreal kqq = v_set1d(qi[i] * qj[j] * coul_const);
        vreal wv4, dwdrv4, uv4, dudrv4;
        vdwl.template compute<need_ene>(wv4, dwdrv4, r, r2, rinv, r2inv, eps, sig);
        coul.template compute<need_ene>(uv4, dudrv4, r, r2, rinv, r2inv, kqq, scale14v4);

        real fpair = v_extf0(dwdrv4 + dudrv4);
        fxit[i] += rij.x * fpair;
        fyit[i] += rij.y * fpair;
        fzit[i] += rij.z * fpair;
        fj[j] -= fpair * rij;
        ecoulv4 += v_set1d(v_extf0(uv4) * 0.25);
        evdwlv4 += v_set1d(v_extf0(wv4) * 0.25);
      }
      if (icell == jcell) {
        for (int i = 0; i < jmeta.natom; i++) {
          fj[i].x += fxit[i];
          fj[i].y += fyit[i];
          fj[i].z += fzit[i];
        }
      }

      lwpf_start(DMA_WJ);
      if (irep == -1) {
        pe_put(jcell->f, fj, sizeof(vec<real>) * jmeta.natom);
      } else {
        pe_put(archdata.freps[jmeta.first_frep + irep].f, fj, sizeof(vec<real>) * jmeta.natom);
      }
      dma_syn();
      lwpf_stop(DMA_WJ);
    }
  }
  lwpf_stop(COMPUTE);
  qthread_syn();
  //if (_MYID == 13) exit(0);
  FOREACH_CELL_CPE_RR(&lgrid, ii, jj, kk, cell) {
    cellmeta_t meta;
    pe_get(&(cell->basis), &meta, sizeof(cellmeta_t));
    dma_syn();
    vec<real> f[CELL_CAP];
    pe_get(cell->f, f, sizeof(vec<real>) * meta.natom);
    dma_syn();

    int nrep_cell = __builtin_popcountl(meta.pe_mask) - 1;
    for (int irep = 0; irep < nrep_cell; irep++) {
      vec<real> frep[CELL_CAP];
      pe_get(archdata.freps[meta.first_frep + irep].f, frep, sizeof(vec<real>) * meta.natom);
      dma_syn();
      for (int i = 0; i < meta.natom; i++) {
        vecaddv(f[i], f[i], frep[i]);
      }
    }
    pe_put(cell->f, f, sizeof(vec<real>) * meta.natom);
    dma_syn();
  }
  lwpf_start(FINI);

  double energy[4];

  energy[0] = v_sum(evdwlv4);
  energy[1] = v_sum(ecoulv4);

  reg_reduce_inplace_doublev4(energy, 1);

  if (_MYID == 0) {
    lstat.evdwl += energy[0];
    lstat.ecoul += energy[1];
    pe_put(lpm.stat, &lstat, sizeof(mdstat_t));
    dma_syn();
  }
  lwpf_stop(FINI);
  lwpf_exit(NONB);
}

extern "C" {
void slave_nonbonded_force_vec_lj_shifted_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulShifted, 1>(pm);
}

void slave_nonbonded_force_vec_lj_rf_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulRf, 1, SQRT_FIRST>(pm);
}
  void slave_nonbonded_force_vec_lj_rf_cpe2(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator_v2<VdwLJSwitched, CoulRf, 1, SOFTWARE>(pm);
}
void slave_nonbonded_force_vec_lj_msm4_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<4>, 1>(pm);
}
void slave_nonbonded_force_vec_lj_msm6_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<6>, 1>(pm);
}
void slave_nonbonded_force_vec_lj_msm8_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<8>, 1>(pm);
}
void slave_nonbonded_force_vec_lj_msm10_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<10>, 1>(pm);
}
void slave_nonbonded_force_vec_lj_msm12_cpe(cpe_nonbonded_param_t *pm) {
  nonbonded_force_vec_iterator<VdwLJSwitched, CoulMSM<12>, 1>(pm);
}
}
#endif
